import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
from sqlalchemy import create_engine
import os
import pymysql
phi_house_data_vis = pd.read_csv('../data/phi_opa_properties_public.csv')
# original file (large file >300MB) can be download from
# https://www.opendataphilly.org/dataset/opa-property-assessments/resource/3084509c-82ad-4718-8ab2-474196eff98b
# This data visualzation is based the original uncleaned data from Philly government
# You will have some senses what kinds of housing related descirption words has the most frequency in the Phiadelphia
phi_house_data_vis.head()
phi_house_data_vis.columns
# Street Name:
plt.subplots(figsize = (25, 15))
wordcloud=WordCloud(background_color = 'white',
width = 1920,
height = 1080
).generate(" ".join(phi_house_data_vis.street_name))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
cloud = wordcloud.to_file('../jupyter_nb/Vis_Figures/WC_street_name.jpg')
#category_code_description
plt.subplots(figsize = (25, 15))
wordcloud=WordCloud(background_color = 'white',
width = 1920,
height = 1080
).generate(" ".join(phi_house_data_vis.category_code_description))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
cloud = wordcloud.to_file('../jupyter_nb/Vis_Figures/WC_category_code_description.jpg')
# owner name
plt.subplots(figsize = (25, 15))
wordcloud=WordCloud(background_color = 'white',
width = 1920,
height = 1080
).generate(" ".join(phi_house_data_vis.owner_1))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
cloud = wordcloud.to_file('../jupyter_nb/Vis_Figures/WC_owner_name.jpg')
# building_code
plt.subplots(figsize = (25, 15))
wordcloud=WordCloud(background_color = 'white',
width = 1920,
height = 1080
).generate(" ".join(phi_house_data_vis.building_code))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
cloud = wordcloud.to_file('../jupyter_nb/Vis_Figures/WC_building_code.jpg')
#House Price less than $ 200,000 (> 1000)
geo_phi_pp_vis = phi_house_data_vis[(
phi_house_data_vis.market_value >= 1000) & (phi_house_data_vis.market_value <= 200000)].plot(
kind='scatter', x='lng', y='lat', label='residental houses',c='market_value', cmap=plt.get_cmap('spring'),
colorbar=True, alpha=0.4, figsize=(14,8))
geo_phi_pp_vis.legend()
geo_phi_pp_vis.figure.savefig('../jupyter_nb/Vis_Figures/phi_house_market_value_less_200k.jpg')
geo_phi_pp_vis = phi_house_data_vis[(
phi_house_data_vis.market_value >= 200000) & (phi_house_data_vis.market_value <= 800000)].plot(
kind='scatter', x='lng', y='lat',
label='residental houses',c='market_value', cmap=plt.get_cmap('summer'),
colorbar=True, alpha=0.4, figsize=(14,8))
geo_phi_pp_vis.legend()
geo_phi_pp_vis.figure.savefig('../jupyter_nb/Vis_Figures/phi_house_market_value_200k_800k.jpg')
geo_phi_pp_vis = phi_house_data_vis[(
phi_house_data_vis.market_value > 800000) & (phi_house_data_vis.market_value <= 2000000)].plot(
kind='scatter', x='lng', y='lat',
label='residental houses',c='market_value', cmap=plt.get_cmap('autumn'),
colorbar=True, alpha=0.4, figsize=(14,8))
geo_phi_pp_vis.legend()
geo_phi_pp_vis.figure.savefig('../jupyter_nb/Vis_Figures/phi_house_market_value_800k_2M.jpg')
geo_phi_pp_vis = phi_house_data_vis[phi_house_data_vis.market_value >= 2000000].plot(kind='scatter', x='lng', y='lat',
label='residental houses',c='market_value', cmap=plt.get_cmap('winter'),
colorbar=True, alpha=0.4, figsize=(14,8))
geo_phi_pp_vis.legend()
geo_phi_pp_vis.figure.savefig('../jupyter_nb/Vis_Figures/phi_house_market_value_great_2M.jpg')
zip_code_gb_phi_df = pd.read_csv('../data/summary_data/zip_code_gb_phi_house_df_summary.csv')
zip_code_gb_phi_df.head()
# Let's look at the describe of the summary data
zip_code_gb_phi_df.describe().round(1)
zip_code_gb_phi_df.shape
zip_code_gb_phi_df.isnull().sum()
zip_code_gb_phi_df = zip_code_gb_phi_df.fillna(0)
zip_code_gb_phi_df.isnull().sum()
# Average Market Price Group By Zip Zode
fig = px.bar(zip_code_gb_phi_df, x = 'zip_code', y= 'avg_market_value', color = 'number_zip_code_count',
template = 'plotly_dark', title = 'Average Market Value - Zip Code - Number of Zip Code Count (color)')
fig.update_layout(xaxis_tickformat = ' ')
fig.update_layout(yaxis_title="avg_market_value (USD)")
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.update_xaxes(nticks=53)
fig.show()
fig.write_image('../jupyter_nb/Vis_Figures/Avg_Mmarket_By_Zip_Code.jpg')
# Save a copy of jpg file for Report
# Average Taxable Building Group By Zip Zode
fig = px.bar(zip_code_gb_phi_df, x = 'zip_code', y= 'avg_taxable_building', color = 'number_zip_code_count',
template = 'plotly_dark', title = 'Average Taxable Building - Zip Code - Number of Zip Code Count (color)')
fig.update_layout(xaxis_tickformat = ' ')
fig.update_layout(yaxis_title="avg_taxable_building (USD)")
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.update_xaxes(nticks=53)
fig.show()
fig.write_image('../jupyter_nb/Vis_Figures/Avg_Taxable_Building_By_Zip_Code.jpg')
# Average Sale Price (publica records) Group By Zip Zode
fig = px.bar(zip_code_gb_phi_df, x = 'zip_code', y= 'avg_sale_price', color = 'number_zip_code_count',
template = 'plotly_dark',
title = 'Average Sales Price (Public Properties) - Zip Code - Number of Zip Code Count')
fig.update_layout(xaxis_tickformat = ' ')
fig.update_layout(yaxis_title="avg_sales_price (USD)")
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.update_xaxes(nticks=53)
fig.show()
fig.write_image('../jupyter_nb/Vis_Figures/Avg_sales_Price_By_Zip_Code.jpg')
# Average Market Sales Price 2017 - 2020 Group By Zip Zode
fig = px.bar(zip_code_gb_phi_df, x = 'zip_code', y= 'avg_sales_price_2017_to_2020_market_sale', color = 'ave_sale_count_2017_to_2020_market_sale',
template = 'plotly_white',
title = 'Melissa Agent Average Market Sales Price - Zip Code - Melissa Agent Average Market Sales Count (color)')
fig.update_layout(xaxis_tickformat = ' ')
fig.update_layout(yaxis_title="avg_sales_price_2017_to_2020 mkt (USD)")
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.update_xaxes(nticks=53)
fig.show()
fig.write_image('../jupyter_nb/Vis_Figures/Avg_sales_Price_Melissa_By_Zip_Code.jpg')
# 3D scatter plot (major houses were built before 1955)
fig = px.scatter(zip_code_gb_phi_df, x="avg_year_built", y="avg_market_value", color="zip_code",
size='number_zip_code_count', template = 'ggplot2',
title = 'Average Market Value - Average Year Built - Zip Code (color) ')
fig.update_layout(xaxis_tickformat = ' ')
fig.update_layout(yaxis_title="avg_market_value (USD)")
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.show()
fig.write_image('../jupyter_nb/Vis_Figures/Avg_market_value_By_year_built_scatter.jpg')
fig = px.scatter(zip_code_gb_phi_df, x="avg_year_built", y="avg_sale_price", color="zip_code",
size='number_zip_code_count', template = 'seaborn',
title = 'Average Sales Price (Public Properties) - Average Year Built - Zip Code (color) ')
fig.update_layout(xaxis_tickformat = ' ')
fig.update_layout(yaxis_title="avg_market_value (USD)")
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.show()
fig.write_image('../jupyter_nb/Vis_Figures/Avg_market_sale_price_pp_By_year_built_scatter.jpg')
# 4D plot
fig = px.scatter_3d(zip_code_gb_phi_df, x = 'zip_code', y = 'avg_taxable_building', z='avg_year_built',
color = 'avg_market_value', template = 'plotly_dark',
title = 'x: Zip Code - y: Average Taxable Building(USD) - z: Average Year Built - color: Average Market Value(USD)')
fig.show()
fig.write_image('../jupyter_nb/Vis_Figures/Avg_market_value_avg_yb_avg_tb_By_Zipcode_scatter_4d.jpg')
# More 4d plot on different parameter
fig = px.scatter_3d(zip_code_gb_phi_df, x = 'avg_number_bedrooms', y = 'avg_livable_area', z='avg_number_stories',
color = 'avg_market_value', template = 'plotly_dark', hover_data = ['zip_code'],
title = 'x: Avg Number Bedrooms - y: Avg livable Area - z: Avg Number Stories - color: Avg Market Value')
fig.show()
fig.write_image('../jupyter_nb/Vis_Figures/Avg_market_value_avg_stories_avg_la_avg_bds_scatter_4d.jpg')
fig = px.scatter_3d(zip_code_gb_phi_df, x = 'healthy_corner_stores_count', y = 'schools_count', z='parks_recreation_count',
color = 'avg_market_value', template = 'plotly', hover_data = ['zip_code'],
title = 'Average Market Value - Parks Recreation Count - Schools Counts - Healthy Corner Store Count')
fig.show()
fig.write_image('../jupyter_nb/Vis_Figures/Avg_market_value_avg_pr_cnt_avg_schl_cnt_avg_hcs_scatter_4d.jpg')
# 5D plot Demo
fig = px.scatter_3d(zip_code_gb_phi_df, x='schools_count', y = 'health_centers_count', z = 'parks_recreation_count',
size='avg_market_value', color='zip_code', template = 'plotly_dark',
title = 'x: Schools Cnt - y: Health Centers Cnt - z: Parks Recreation Cnt - size: Average Mkt Value - color: Zip Code')
fig.show()
fig.write_image('../jupyter_nb/Vis_Figures/Avg_market_value_schl_cnt_hcs_count_pr_cnt_avg_market_value_zipcode_scatter_5d.jpg')
# After Data Visualization, the summary data will be saved into the mysql database.
conn = create_engine("mysql+pymysql://" + os.environ.get("mysql_user") + ":" + os.environ.get("mysql_key") + "@localhost:3306/data_group_project")
df = pd.read_csv('../data/summary_data/zip_code_gb_phi_house_df_summary.csv', delimiter=',')
df.to_sql(name='zip_code_gb_phi_house_df_summary', con=conn, schema='data_group_project', if_exists='replace')